# load packages and data --------------------------------------------------
library(haven)
library(tidyverse)
library(dplyr)
library(psych)
library(stargazer)
library(lme4)
library(performance)
library(nortest)
library("stringr")

library("intsvy")
library("dplyr")
library("ggplot2")
library("tidyr")
library(BIFIEsurvey)

root <- "C:/Users/Wieczorek_W_Station/Dropbox/Projekte/Effektive_Schulsteuerung_PISA/Daten/PISA/"
#root <- "C:/Users/olive/Dropbox/Projekte/Effektive_Schulsteuerung_PISA/Daten/PISA/"
path <- paste0(root,"Aufbereitete_Daten/")
output <- paste0(root,"Output_BIFIE/")
setwd(path)

y <-2015

# Define function calls ---------------------------------------------------

##Output generator
bifie_stats <- function(bifie_output){
stats <- bifie_output$stat

stats <- stats %>%
  select(parameter,est, SE,t,p) %>%
  as_tibble()

stats <- stats %>%
  add_row(parameter = "no Students", est = bifie_output$N)

stats <- stats %>%
  add_row(parameter = "no Schools", est = bifie_output$Nclusters)

stats <- stats %>% 
  filter(substr(parameter,1,4)=="beta" |
           substr(parameter,1,3)=="ICC" |
           substr(parameter,1,2)=="R2" |
           substr(parameter,1,3) =="no "
  )#
return(stats)
}
## Calculate Variance inflation Factor of IVs required for the Regression
## Analysis
vif_calculator <- function(bifie_postesitmation, original_dataframe){
  ##prepare the dataframe
  pred_pre <- bifie_postesitmation %>%
    filter(str_detect(parameter,"beta"))
  ##get the parameters
  parms <-  gsub("beta_","", pred_pre$parameter) %>%
    as.list()
  
  ## Create a dataframe based on the IVs
  ## But: check if an interaction term is present.
  ## Interaction terms contain the ":"-character.
  if(sum(str_count(pattern=":", as.vector(parms))) > 0){  
    ##create a reduced dataframe
    model_df_reduced <- original_dataframe[colnames(original_dataframe) %in% parms]
    model_df_reduced <- model_df_reduced %>%
      mutate(Z_ESCS_Z_MEAN_ESCS = Z_ESCS * Z_MEAN_ESCS)
  }  else{
    model_df_reduced <- original_dataframe[colnames(original_dataframe) %in% parms]
  }
  
  ##prepare data for further investigation
  Vars <- model_df_reduced %>%
    colnames() #Vector for variable names
  VIF <- c() # empty vector for VIFs
  
  vif_dataframe <- tibble(variables = Vars)
  
  ## If our dataframe got more than one variable, then perform VIF
  ## Otherwise: return NA-values
  if(nrow(vif_dataframe) >1){
    ## generate a negation to filter DV
    `%nin%` = Negate(`%in%`) 
    for(V in Vars){
      DV <- V
      IVS <- as.vector(Vars)
      IVS <- IVS[IVS %nin% DV]
      
      IV_formula <- ""
      checkpos <- length(IVS)
      pos <- 1
      
      for(i in IVS){
        if(pos < checkpos) {
          IV_formula <- paste0(IV_formula,i," + ")
          pos <- pos + 1 
        }
        else{
          IV_formula <- paste0(IV_formula,i)
        }#
      }
      
      ## cerate a formula for calculating the VIF
      m <- as.formula(paste(DV, "~", IV_formula))
      
      ## Regress one IV of the imputated model on other IVs
      vif_model <- summary(lm(m, data = model_df_reduced))
      
      ## calculate VIF as 1/(1-R�j_th variable)
      value <- as.numeric(1/(1-vif_model$r.squared))
      ##show model
      vif_model
      
      ##append VIF to variable
      VIF <- append(VIF,as.numeric(value))
      
    }
    
    ## generate VIF- Dataframe
    vif_dataframe <- vif_dataframe %>%
      mutate(VIF = VIF)
    
    ## Add mean VIF
    vif_dataframe <- vif_dataframe %>%
      add_row(variables = "MEAN_VIF", VIF = mean(VIF))
  } else{
    ## generate VIF- Dataframe
    vif_dataframe <- vif_dataframe %>%
      mutate(VIF = NA)
    
    ## Add mean VIF
    vif_dataframe <- vif_dataframe %>%
      add_row(variables = "MEAN_VIF", VIF = NA)
  }
  return(vif_dataframe)
}

# Calculate Error Terms and Shapiro-Francia Test ---------------------------
## predict values for DV
## try to fetch the error terms for each model
sf_calculator <- function(bifie_postestimation,original_dataframe){
  pred_pre <- bifie_postestimation %>% 
    filter(str_detect(parameter,"beta"))
  
  ## fetch intercept_value
  alpha <- pred_pre %>%
    filter(str_detect(parameter, "Intercept")) %>%
    select(est)
  alpha <- alpha$est
  
  ##get the parameters
  betas <-  gsub("beta_","", pred_pre$parameter) %>%
    as.vector()
  
  # exclude intercept from parameter-List
  pred_pre$parameter <- betas
  pred_pre <- pred_pre %>%
    filter(str_detect(betas, "Intercept", negate= TRUE))
  
  
  
  ## check for interaction terms
  interactions <- betas[str_detect(betas,":")]
  
  ## split interaction terms
  interactions_split <- str_split(interactions, pattern =":")
  
  ## reduce beta-values
  betas <- betas[str_detect(betas, "Intercept", negate= TRUE)]
  betas <-betas[str_detect(betas,interactions, negate=TRUE)]
  
  
  ##add DV-Variables to parameters
  dep ="MEAN"
  dv_parms <- original_dataframe %>%
    select(contains("MEAN") & contains("PV")) %>%
    colnames() 
  
  dv_iv_parms <- append(dv_parms,betas)
  
  model_df_reduced <- original_dataframe %>%
    select(dv_iv_parms)
  
  if(length(interactions > 0)){
    print(paste("interactions present for model:",deparse(substitute(bifie_postestimation))))
    for(i in length(interactions_split)){
      print(interactions_split[[i]])
      interaction_variables = interactions_split[[i]]
      model_df_reduced[paste0(interaction_variables[1],
                              "_",
                              interaction_variables[2])
      ] <- model_df_reduced[interaction_variables[1]] * 
        model_df_reduced[interaction_variables[2]]
      
      ## add interaction term to betas
      
      betas <- append(betas,paste0(interaction_variables[1],
                                   "_",
                                   interaction_variables[2]))
    }
  }
  
  pred_pre$parameter <- str_replace(as.vector(pred_pre$parameter),":","_")
  
  ## calculate beta-values * values given per model
  for(b in betas){
    beta_value <- pred_pre %>%
      filter(parameter == b) %>%
      select(est) %>%
      as.numeric()
    varname_new <- paste0(b,"_predict")
    model_df_reduced[varname_new] <- model_df_reduced[b] * beta_value
    
  }
  model_df_reduced["prediction"] <- alpha + 
    rowSums(
      model_df_reduced[
        str_detect(
          colnames(model_df_reduced), "_predict")
      ]
    )
  ##create an empty list to store error-terms
  
  
  dvs <- model_df_reduced %>%
    select(starts_with("PV") & ends_with("MEAN")) %>%
    colnames()

  ## generate distances over all plausible values
  
  error_count <- 1
  for(d in dvs){
    
    model_df_reduced[
      paste0("ERRORS_",as.character(error_count)
      )
    ] <- model_df_reduced["prediction"] - model_df_reduced[d]
    
    
    error_count <- error_count +1
  }
  
  model_df_reduced["MEAN_ERROR"] <- model_df_reduced %>%
    select(starts_with("ERROR")) %>%
    rowMeans()
  
  errors <- model_df_reduced$MEAN_ERROR
  
  sampling <- NA
  if(length(errors) <=5000){
      sf_testvalue <- sf.test(errors)
      sampling <- "no"
    } else{
      seed = 42
      sf_testvalue <- sf.test(sample(errors, size = 5000))
      sampling <- "yes"
  }
  
  sf_testvalue <- tibble(model = deparse(substitute(bifie_postestimation)),
                         W = sf_testvalue$statistic,
                         P = sf_testvalue$p.value,
                         imputations = length(dvs),
                         obs = nrow(original_dataframe),
                         sampling = sampling)
  
  ##H0 = Variable follows a normal distribution
  ##H1 = Variable does not follow a normal distribution
    if(sf_testvalue$P < 0.05){
      sf_testvalue <- sf_testvalue %>% 
        mutate(normally_distributed = "no")
    }else{
      sf_testvalue<- sf_testvalue %>% 
        mutate(normally_distributed = "yes")
  }

  return(sf_testvalue)
}


# Load Country Dataframe for 2015 -----------------------------------------
#countries <- as.list(unique(countries))

#countries <- c("DEU","GBR","USA","CAN","FIN","SWE", "AUS","CHE","FRK")

countries <- c("KOR", "SGP", "USA", "CAN")

for(country in countries){
  print(paste("currently at:",country))
  
  
  path_year <- paste0(path,"2015","/")
  setwd(path_year)
  
  
  df <- read.csv(paste0(country,"with_indices_2015.csv")) 
  df <- as_tibble(df)
  df <- df %>% select(-starts_with("X"))
  
  ## rename variables
  df <- df %>% rename(isei = bfmj2) %>%
    rename(schoolid = cntschid) %>%
    rename(country = cntryid)

  

  for(i in seq(1,10)){
    df[paste0("pv",as.character(i),"mean")] <- rowMeans(df[c(paste0("pv",as.character(i),"math"),
         paste0("pv",as.character(i),"read"),
         paste0("pv",as.character(i),"scie"))], na.rm = T)
  }


for(i in seq(1,10)){
  df[paste0("pv",as.character(i),"read_scie")] <- rowMeans(df[c(paste0("pv",as.character(i),"read"),
                                                                paste0("pv",as.character(i),"scie"))], na.rm = T)
}
 

# Select Variables relevant for regression analysis -----------------------

model_df <-  df %>%
  select(ends_with("mean"), ends_with("read_scie"), escs , mean_escs ,  mig_2nd , mean_mig , langn,
         disclima , mean_disclima, student_absenteeism , teacher_absenteeism , 
         autonomy , leadership , accountability ,schoolid, w_fstuwt, w_schgrnrabwt,
         cntstuid, schoolid,
         starts_with("w_fsturwt")) %>% 
    na.omit()



# Change Column Names to upper --------------------------------------------
colnames(model_df) <- model_df %>%
  colnames() %>%
  toupper()  



# z-standardize variables -------------------------------------------------
model_df <- model_df %>% mutate(Z_ESCS = ESCS - mean(ESCS) / sd(ESCS),
                                  Z_MEAN_ESCS = MEAN_ESCS - mean(MEAN_ESCS) / sd(MEAN_ESCS),
                                  Z_MEAN_MIG = MEAN_MIG -  mean(MEAN_MIG) / sd(MEAN_MIG),
                                  Z_DISCLIMA = DISCLIMA - mean(DISCLIMA) / sd(DISCLIMA),
                                  Z_MEAN_DISCLIMA = MEAN_DISCLIMA - mean(MEAN_DISCLIMA) / sd(MEAN_DISCLIMA),
                                  Z_TEACHER_ABSEENTEISM = TEACHER_ABSENTEEISM - mean(TEACHER_ABSENTEEISM) / sd(TEACHER_ABSENTEEISM),
                                  Z_STUDENT_ABSEENTEISM = STUDENT_ABSENTEEISM - mean(STUDENT_ABSENTEEISM) / sd(STUDENT_ABSENTEEISM),
                                  Z_AUTONOMY = AUTONOMY - mean(AUTONOMY) / sd(AUTONOMY),
                                  Z_LEADERSHIP = LEADERSHIP -mean(LEADERSHIP) / sd(LEADERSHIP),
                                  Z_ACCOUNTABILITY = ACCOUNTABILITY - mean(ACCOUNTABILITY) / sd(ACCOUNTABILITY) 
  )


# Generate BIFIE-Data -----------------------------------------------------

migration_yes_no <- model_df %>% 
  select(MIG_2ND) %>%
  group_by(MIG_2ND) %>%
  count() 

share_migrants <-  migration_yes_no$n / nrow(model_df)

# Generate BIFIE-Data -----------------------------------------------------


RR <- 80
set.seed(42)
test <- BIFIE.data.jack(as.data.frame(model_df), jktype = "RW_PISA",
                        pv_vars = c("MEAN","READ_SCIE"),
                        wgtrep="W_FSTURWT",
                        pvpre = paste0("PV",1:10),
                        fayfac=1 / RR / ( 1 - .5 )^2 )

bifie_data <- test

if(country == "SGP"){
  
  print("calculate model 0")
  m0 <-BIFIE.twolevelreg(BIFIEobj = test,
                         dep = "READ_SCIE",
                         idcluster =  "SCHOOLID",
                         formula.fixed = ~1, 
                         formula.random = ~1,
                         wgtlevel1 <- "W_FSTUWT",
                         wgtlevel2="one",
                         maxiter=1000, se = TRUE)
  
  print("calculate model 1")
  m1 <-BIFIE.twolevelreg(BIFIEobj = test,
                         dep = "READ_SCIE",
                         idcluster =  "SCHOOLID",
                         formula.fixed = ~1 + Z_ESCS,
                         formula.random = ~1 ,
                         wgtlevel1 <- "W_FSTUWT",
                         wgtlevel2="one",
                         maxiter=1000, se = TRUE)
  
  print("calculate model 2")
  m2 <-BIFIE.twolevelreg(BIFIEobj = test,
                         dep = "READ_SCIE",
                         idcluster =  "SCHOOLID",
                         formula.fixed = ~1 + Z_MEAN_ESCS,
                         formula.random = ~1,
                         wgtlevel1 <- "W_FSTUWT",
                         wgtlevel2="one",
                         maxiter=1000, se = TRUE)
  
  print("calculate model 3")
  m3 <-BIFIE.twolevelreg(BIFIEobj = test,
                         dep = "READ_SCIE",
                         idcluster =  "SCHOOLID",
                         formula.fixed = ~1 + Z_ESCS * Z_MEAN_ESCS,
                         formula.random = ~1,
                         wgtlevel1 <- "W_FSTUWT",
                         wgtlevel2="one",
                         maxiter=1000, se = TRUE) 
  
  print("calculate model 4")
  m4 <-BIFIE.twolevelreg(BIFIEobj = test,
                         dep = "READ_SCIE",
                         idcluster =  "SCHOOLID",
                         formula.fixed = ~1 + MIG_2ND  + Z_MEAN_MIG + LANGN,
                         formula.random = ~1,
                         wgtlevel1 <- "W_FSTUWT",
                         wgtlevel2="one",
                         maxiter=1000, se = TRUE) 
  
  print("calculate model 5")
  m5 <-BIFIE.twolevelreg(BIFIEobj = test,
                         dep = "READ_SCIE",
                         idcluster =  "SCHOOLID",
                         formula.fixed = ~1 + Z_DISCLIMA + Z_MEAN_DISCLIMA + Z_TEACHER_ABSEENTEISM + Z_STUDENT_ABSEENTEISM,
                         formula.random = ~1,
                         wgtlevel1 <- "W_FSTUWT",
                         wgtlevel2="one",
                         maxiter=1000, se = TRUE)  
  
  print("calculate model 6")
  m6 <-BIFIE.twolevelreg(BIFIEobj = test,
                         dep = "READ_SCIE",
                         idcluster =  "SCHOOLID",
                         formula.fixed = ~1 + Z_AUTONOMY + Z_LEADERSHIP + Z_ACCOUNTABILITY,
                         formula.random = ~1,
                         wgtlevel1 <- "W_FSTUWT",
                         wgtlevel2="one",
                         maxiter=1000, se = TRUE)  
  
  print("calculate model 9")
  m9 <-BIFIE.twolevelreg(BIFIEobj = test,
                         dep = "READ_SCIE",
                         idcluster =  "SCHOOLID",
                         formula.fixed = ~1 + Z_ESCS * Z_MEAN_ESCS + Z_MEAN_MIG +
                           MIG_2ND + LANGN + 
                           Z_DISCLIMA + Z_TEACHER_ABSEENTEISM + Z_STUDENT_ABSEENTEISM +
                           Z_AUTONOMY + Z_LEADERSHIP + Z_ACCOUNTABILITY,
                         formula.random = ~1,
                         wgtlevel1 <- "W_FSTUWT",
                         wgtlevel2= "one",
                         maxiter=1000, se = TRUE)
}else{
  
  if(share_migrants < 1){
    m0 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1, 
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           wgtlevel2="one",
                           maxiter=1000, se = TRUE)
    
    m1 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + Z_ESCS,
                           formula.random = ~1 ,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE)
    
    m2 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + Z_MEAN_ESCS,
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE)
    
    m3 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + Z_ESCS * Z_MEAN_ESCS,
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE) 
    
    
    m4 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + MIG_2ND  + Z_MEAN_MIG + LANGN,
                           formula.random = ~1 + Z_MEAN_MIG,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE) 
    
    m5 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + Z_DISCLIMA + 
                             Z_TEACHER_ABSEENTEISM + Z_STUDENT_ABSEENTEISM,
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE)  
    
    m6 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + Z_AUTONOMY + Z_LEADERSHIP + Z_ACCOUNTABILITY,
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE)  
    
    m9 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + Z_ESCS * Z_MEAN_ESCS + Z_MEAN_MIG +
                             MIG_2ND + LANGN + 
                             Z_DISCLIMA  + Z_TEACHER_ABSEENTEISM + Z_STUDENT_ABSEENTEISM+ 
                             Z_AUTONOMY + Z_LEADERSHIP + Z_ACCOUNTABILITY,
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE)
  }else{
    m0 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1, 
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           wgtlevel2="one",
                           maxiter=1000, se = TRUE)
    
    m1 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + Z_ESCS,
                           formula.random = ~1 ,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE)
    
    m2 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + Z_MEAN_ESCS,
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE)
    
    m3 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + Z_ESCS * Z_MEAN_ESCS,
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE) 
    
    
    m4 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "MEAN",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1,
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE) 
    
    m5 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + Z_DISCLIMA  + 
                             Z_TEACHER_ABSEENTEISM + Z_STUDENT_ABSEENTEISM,
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE)  
    
    m6 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + Z_AUTONOMY + Z_LEADERSHIP + Z_ACCOUNTABILITY,
                           formula.random = ~1,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE)  

    
    m9 <-BIFIE.twolevelreg(BIFIEobj = test,
                           dep = "READ_SCIE",
                           idcluster =  "SCHOOLID",
                           formula.fixed = ~1 + Z_ESCS * Z_MEAN_ESCS + #Z_MEAN_MIG +
                             #MIG_2ND + LANGN + 
                             Z_DISCLIMA  + Z_TEACHER_ABSEENTEISM + Z_STUDENT_ABSEENTEISM+ 
                             Z_AUTONOMY + Z_LEADERSHIP + Z_ACCOUNTABILITY,
                           formula.random = ~1 ,
                           wgtlevel1 <- "W_FSTUWT",
                           maxiter=1000, se = TRUE)
    
    
  }
}


# Generate Outputs --------------------------------------------------------

setwd(output)

##generate statistiscs for export (fixed effects)
stats_m0 <- bifie_stats(m0)
stats_m1 <- bifie_stats(m1)
stats_m2 <- bifie_stats(m2)
stats_m3 <- bifie_stats(m3)
stats_m4 <- bifie_stats(m4)
stats_m5 <- bifie_stats(m5)
stats_m6 <- bifie_stats(m6)
stats_m9 <- bifie_stats(m9)

# Combine statstics -------------------------------------------------------
varnames_statistics <- c()

for(var in stats_m9$parameter){
  varnames_statistics <- append(varnames_statistics,var)
  varnames_statistics <- append(varnames_statistics,paste0(var,"_SD"))
}


export_data <- as_tibble(varnames_statistics) %>% 
  rename("parameter" = "value") %>%
  filter(parameter != "no Students_SD") %>%
  filter(parameter != "no Schools_SD") 

## Create Estimates with stars

models <- list(stats_m0, 
               stats_m1, 
               stats_m2, 
               stats_m3,
               stats_m4, 
               stats_m5, 
               stats_m6, 
               stats_m9)

model_count <- 0
for(MODEL in models){
  
  ## generate model name
  MODEL_NAME <- paste0("Model_",as.character(model_count))
  
  ## generate table with parameter values and p-values
  stats_est <- MODEL %>%
    select(parameter, est,p) %>% 
    mutate(est = as.character(round(est, digits=3)),
           p = round(p, digits=3)) %>%
    mutate(p = case_when(p < 0.001 ~ "***",
                         p < 0.01 ~"**",
                         p < 0.05 ~ "*",
                         p >= 0.05 ~ "",
                         p == NA ~ ""))
  
  stats_est <-stats_est %>%
    unite(est, est,p, na.rm =T, sep="")
  
  
  colnames(stats_est) <- c("parameter", MODEL_NAME)
  
  ## generate standard error estimations
  
  stats_se <- MODEL %>% 
    select(parameter,SE) %>%
    mutate(SE = paste0("(", 
                       as.character(round(SE,digits=3)),
                       ")")) 
  colnames(stats_se) <- c("parameter", MODEL_NAME)
  
  
  ## rename Standard Error parameters
  PARMS <- c()
  for(p in stats_se$parameter){
    PARMS <- append(PARMS, paste0(p,"_SD"))
  }
  stats_se$parameter <- PARMS
  rm(PARMS)
  
  
  export_data <- export_data %>%
    left_join(rbind(stats_se,stats_est), by = c("parameter"))
  model_count <- model_count +1
}



xlsx::write.xlsx(export_data,paste0("2015_sci_reading_",country,".xlsx"),
                 showNA=FALSE)



vif_m1 <- vif_calculator(stats_m1,model_df) %>%
  rename(VIF_m1=VIF)

vif_m2 <- vif_calculator(stats_m2,model_df) %>%
  rename(VIF_m2=VIF)

vif_m3 <- vif_calculator(stats_m3,model_df) %>%
  rename(VIF_m3=VIF)

vif_m4 <- vif_calculator(stats_m4,model_df) %>%
  rename(VIF_m4=VIF)

vif_m5 <- vif_calculator(stats_m5,model_df) %>%
  rename(VIF_m5=VIF)

vif_m6 <- vif_calculator(stats_m6,model_df) %>%
  rename(VIF_m6=VIF)



vif_m9 <- vif_calculator(stats_m9,model_df) %>%
  rename(VIF_m9 = VIF)

vif_dataframe <- vif_m9 %>% 
  left_join(vif_m1) %>%
  left_join(vif_m2) %>%
  left_join(vif_m3) %>%
  left_join(vif_m4) %>%
  left_join(vif_m5) %>%
  left_join(vif_m6) %>%

  relocate(VIF_m9, .after = last_col())


vif_dataframe <-    vif_dataframe %>%
  slice(3,4,nrow(vif_dataframe)-1,1,5,2,6:10, nrow(vif_dataframe))

vars <- vif_dataframe$variables
vif_dataframe <- as.data.frame(lapply(vif_dataframe, function(x) round(as.numeric(x), 3)))
vif_dataframe$variables <- vars


xlsx::write.xlsx(vif_dataframe,paste0(as.character(y),"_sci_reading_",country,"_VIF.xlsx"),
                 showNA=FALSE)

##Calculate Shapiro-Francia-Tests
shapiros <- sf_calculator(stats_m1,model_df) %>%
  add_row(sf_calculator(stats_m2,model_df)) %>%
  add_row(sf_calculator(stats_m3,model_df)) %>%
  add_row(sf_calculator(stats_m4,model_df)) %>%
  add_row(sf_calculator(stats_m5,model_df)) %>%
  add_row(sf_calculator(stats_m6,model_df)) %>%
  add_row(sf_calculator(stats_m9,model_df))
xlsx::write.xlsx(shapiros,paste0(as.character(y),"_sci_reading_",country,"_sf-test.xlsx"),
                 showNA=FALSE)

# Create the resdualplots -------------------------------------------------


## create a dataframe containing the estimators of the full model
estimators_m9 <- stats_m9 %>%
  filter(str_detect(parameter, "beta")) %>%
  select(parameter,est)

#change parameter
estimators_m9$parameter <- estimators_m9 %>%
  pull(parameter) %>%
  str_replace("beta_","") %>%
  str_replace(":","_") %>%
  str_replace_all(c("\\(" = "", "\\)" = ""))

#get parameters list
parameters_list <- estimators_m9 %>%
  pull(parameter) 

## get the data to calculate the residuals
residuals_data <- bifie_data$dat1 %>%
  unique() %>%
  as_tibble() %>%
  mutate(Z_ESCS_Z_MEAN_ESCS = Z_ESCS * Z_MEAN_ESCS) %>%
  mutate(Z_ESCS_Z_MEAN_ESCS = Z_ESCS_Z_MEAN_ESCS - mean(Z_ESCS_Z_MEAN_ESCS) / sd(Z_ESCS_Z_MEAN_ESCS) ) %>%
  mutate(Intercept = 1)

prediction_step1 <- residuals_data %>%
  select(estimators_m9$parameter)

## multiply each variable by the predictor and sum afterwards
i <- 1
while(i <= length(parameters_list)){
  prediction_step1[parameters_list[i]] <- prediction_step1 %>%
    select(parameters_list[i]) *
    estimators_m9 %>%
    filter(parameter == parameters_list[i]) %>%
    pull(est)
  
  print(paste(parameters_list[i],
              i,
              head(prediction_step1[parameters_list[i]]))
  )
  
  i <- i + 1
}

## make a rowwise dataframe
prediction_step2 <- prediction_step1 %>%
  mutate(id = sequence(1,nrow(prediction_step1))) %>%
  rowwise(id)
prediction_step2 <- prediction_step2 %>%
  mutate(predict = sum(c_across()))


## attach the imputed mean reading and science scores
prediction_step2["READ_SCIE"] <- residuals_data %>%
  pull(READ_SCIE)

## calculate the differences
prediction_step2 <- prediction_step2 %>%
  mutate(pred_diff = READ_SCIE - predict)


## create a plot with measured versus predicted values
predict_plot <- ggplot(prediction_step2, aes(x=READ_SCIE,y=predict)) +
  geom_point(alpha = 0.3, color="steelblue") + 
  stat_density_2d(aes(fill = ..level..), geom="polygon", alpha = 0.3) +
  geom_smooth(method=lm, color = "black")

## add a heat map by kernel density plot
predict_plot <- predict_plot + 
  xlab("Measured: imputed PISA mean scores of science and reading") +
  ylab("Predicted: imputed PISA mean scores\nof science and reading") +
  ggtitle(paste("Full model:",country,"2015")) +
  theme(plot.title = element_text(hjust = 0.5))

## create a histogram with the distribution of residuals of the full model
histogram <- ggplot(prediction_step2, aes(x = pred_diff,
                                          y =..density..)) +
  geom_histogram(binwidth = 20, 
                 fill = "steelblue",
                 color = "black",
                 alpha = 0.5) +
  geom_density(fill = "steelblue",
               color = "black",
               alpha = 0.5) +
  geom_vline(xintercept = 0, linetype=2)

histogram <- histogram + 
  xlab("Residuals: imputed PISA mean scores of science and reading") + 
  ggtitle(paste("Full model:",country,"2015")) +
  theme(plot.title = element_text(hjust = 0.5))

## save the plots
ggsave(plot = predict_plot, 
       filename = paste0("prediction_plot_full_model",country,"_2015.png"),
       dpi = 300,
       units= "cm",
       width = 15,
       height = 10)

ggsave(plot = histogram, 
       filename = paste0("residuals_histogram_full_model",country,"_2015.png"),
       dpi = 300,
       units= "cm",
       width = 15,
       height = 10)
}









